import json
import random
from eval_utils import Model, PROMPT

# Segmented vector-database dump that the validation set is built from.
path = r'C:\Users\JHC258\Desktop\文本检索\vec_db_data\segment.json'

# Parse the whole document first so the file handle is released before any
# further processing happens.
with open(path, mode='r', encoding='utf-8') as file:
    db_data = json.load(file)['db_data']

# Three entries of the dump that the rest of the script iterates in lockstep:
# segment text, source name (stored later as the label) and file type.
text, name, file_type = db_data['text'], db_data['name'], db_data['type']

llm = Model()

# Maps each generated query to the segment it was generated from.
# NOTE(review): if the model returns the same query for two segments, the later
# segment silently replaces the earlier one -- confirm that is acceptable.
retrieval_valid = {}

total_num = len(name)

# Distinct loop-variable names keep the source lists (`text`, `name`,
# `file_type`) intact instead of rebinding them to single elements, and
# `enumerate` replaces the hand-maintained 1-based counter.
for i, (seg_text, seg_name, seg_type) in enumerate(zip(text, name, file_type), start=1):
    # Probability of keeping a segment, chosen by its file extension:
    # 5% for .xlsx/.xls, 50% for .docx/.doc, always for everything else.
    if seg_type in ('.xlsx', '.xls'):
        sample_rate = 0.05
    elif seg_type in ('.docx', '.doc'):
        sample_rate = 0.5
    else:
        sample_rate = 1.0

    # Exactly one random draw per segment, as before; skip unsampled segments.
    if random.random() > sample_rate:
        continue

    content = PROMPT.format(content=seg_text)
    messages = [
        {'role': 'user', 'content': content}
    ]
    res = llm.generate(messages)

    # Drop the 'Query=' marker from the reply and trim surrounding whitespace.
    query = res.replace('Query=', '').strip()
    retrieval_valid[query] = {'label': seg_name, 'content': seg_text, 'type': seg_type}

    # Progress output: raw model reply followed by the source segment.
    print(f'{i}/{total_num}: {res}')
    print('_' * 45)
    print(seg_text)
    print('_' * 99)

# Destination of the generated query -> segment validation set.
json_path = r'C:\Users\JHC258\Desktop\文本检索\evaluate\valid_set\text2query.json'

# Serialise first, then write in a single call; ensure_ascii=False keeps
# non-ASCII characters readable in the output file instead of \u escapes.
with open(json_path, 'w+', encoding='utf-8') as f:
    serialized = json.dumps(retrieval_valid, ensure_ascii=False)
    f.write(serialized)
